Load in data

Variables: air temperature (°C), wind speed (m/s), and wind direction (degrees the wind is blowing from; 0 = from N, 90 = from E, etc.).

# Collect the full paths of every file in the "2011-2021" folder, in sorted
# order, and print them for inspection.
filelist <- sort(list.files(path = "2011-2021", full.names = TRUE))
filelist
##  [1] "2011-2021/chi2011.04t.avg.txt" "2011-2021/chi2012.04t.avg.txt"
##  [3] "2011-2021/chi2013.04t.avg.txt" "2011-2021/chi2014.04t.avg.txt"
##  [5] "2011-2021/chi2015.04t.avg.txt" "2011-2021/chi2016.04t.avg.txt"
##  [7] "2011-2021/chi2017.04t.avg.txt" "2011-2021/chi2018.04t.avg.txt"
##  [9] "2011-2021/chi2019.04t.avg.txt" "2011-2021/chi2020.04t.avg.txt"
## [11] "2011-2021/chi2021.04t.avg.txt"
# Read the first file (whitespace-delimited, with a header row) as the starting
# table, then preview its first ten rows.
r <- read.csv(file = filelist[1], header = TRUE, sep = "")
head(r, n = 10)
##    DOY    WS  WD    AT   n
## 1    1 11.89 234  1.34 288
## 2    2  9.66 250 -8.07 288
## 3    3  8.11 214 -2.25 288
## 4    4  7.93 257 -2.40 288
## 5    5  5.41 236 -6.81 288
## 6    6  7.74 288 -2.97 288
## 7    7  6.85 295 -7.22 288
## 8    8  9.71 310 -8.74 288
## 9    9  5.05 312 -6.08 288
## 10  10  3.46 144 -3.00 250
# Read every yearly file and stack them into one data frame.
# All files (including the first) are read here in a single pass so that each
# row can be tagged with the year taken from its file name; this also avoids
# growing `r` with rbind() inside a loop and the `2:length()` pitfall when
# only one file is present.
yearly <- lapply(filelist, function(path) {
  read.csv(path, sep = "", header = TRUE)
})
r <- do.call(rbind, yearly)

# Year of each file, parsed from base names such as "chi2011.04t.avg.txt"
# (first run of four digits in the file name, ignoring the folder name).
file_year <- as.numeric(
  sub("^[^0-9]*([0-9]{4}).*$", "\\1", basename(filelist))
)
stopifnot(!anyNA(file_year))
row_year <- rep(file_year, times = vapply(yearly, nrow, integer(1)))

# Build the calendar date from year + day-of-year instead of assuming one row
# per consecutive day: the files do not contain every day, so a plain daily
# sequence drifts (DOY 365 of 2021 ended up labelled 2021-12-19) and shifts
# the derived Month for every row after the first gap.
df <- r
df$Date <- as.Date(paste0(row_year, "-01-01")) + (df$DOY - 1)
df$Month <- as.numeric(format(df$Date, "%m"))
df$Year <- as.numeric(format(df$Date, "%Y"))
tail(df)
##      DOY   WS  WD   AT   n       Date Month Year
## 4001 360 4.32  44 3.70 719 2021-12-14    12 2021
## 4002 361 9.58 127 4.48 719 2021-12-15    12 2021
## 4003 362 7.18 114 1.93 719 2021-12-16    12 2021
## 4004 363 5.93 288 2.11 719 2021-12-17    12 2021
## 4005 364 4.87  95 1.38 719 2021-12-18    12 2021
## 4006 365 4.81 180 2.89 715 2021-12-19    12 2021

Visualization

Wind Speed (m/s)

# Daily wind speed as a time series.
# Plotted against the calendar date: with DOY on the x-axis all years share
# the same 1-366 range and geom_line() zig-zags between years instead of
# following time.
ggplot(df, aes(x = Date, y = WS, color = Month)) +
  geom_line()

# Distribution of all daily wind speeds
hist(df$WS, main = "Histogram of Wind Speed (m/s)", xlab = "Wind Speed (m/s)")

# Box plot by Month
boxplot(WS ~ Month, data = df, ylab = "Wind Speed (m/s)")

### Wind Direction

# Wind rose of daily wind speed by direction, with speed binned in 5 m/s steps
windrose(
  speed = df$WS,
  direction = df$WD,
  speed_cuts = seq(0, 25, 5),
  ggtheme = "minimal"
)

# Daily wind direction as a time series.
# Plotted against the calendar date: with DOY on the x-axis all years share
# the same 1-366 range and geom_line() zig-zags between years instead of
# following time.
ggplot(df, aes(x = Date, y = WD, color = Month)) +
  geom_line()

# General distribution
hist(df$WD, main = "Histogram of Wind Direction", xlab = "Wind Direction")

# Box plot by Month
boxplot(WD ~ Month, data = df, ylab = "Wind Direction")

Air Temperature

# Daily air temperature as a time series.
# Plotted against the calendar date: with DOY on the x-axis all years share
# the same 1-366 range and geom_line() zig-zags between years instead of
# following time.
ggplot(df, aes(x = Date, y = AT, color = Month)) +
  geom_line()

# General distribution (labels corrected from the misspelled "Tempreture")
hist(df$AT, main = "Histogram of Air Temperature", xlab = "Air Temperature")

# Box plot by Month
boxplot(AT ~ Month, data = df, ylab = "Air Temperature")

The line plot and histogram show obvious outliers in May 2019.

# Wind speed against wind direction, one panel per month, with a smoother.
# Written with ggplot() layers because qplot() is deprecated in ggplot2; the
# layers below are the ones qplot() built for geom = c("point", "smooth") and
# facets = . ~ Month.
ggplot(df, aes(x = WD, y = WS, color = Month)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ Month)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Wind speed against air temperature, one panel per month, with a smoother.
# Written with ggplot() layers because qplot() is deprecated in ggplot2; the
# layers below are the ones qplot() built for geom = c("point", "smooth") and
# facets = . ~ Month.
ggplot(df, aes(x = AT, y = WS, color = Month)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ Month)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Correlation (cor() defaults) between daily wind speed and air temperature
with(df, cor(x = WS, y = AT))
## [1] -0.1935277
# Correlation (cor() defaults) between daily wind speed and wind direction.
# NOTE(review): WD is an angle in degrees, so a linear correlation treats
# 0 and 360 as far apart -- interpret with care.
with(df, cor(x = WS, y = WD))
## [1] 0.1970362

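The correlation coefficients of -0.19 (wind speed vs. air temperature) and 0.20 (wind speed vs. wind direction) indicate a weak negative and a weak positive linear relationship, respectively; both are weak and likely unimportant.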

Data Aggregation

We want to explore how wind speed changes from month to month, so we aggregate the daily data into monthly averages.

# Circular mean of compass bearings given in degrees, returned in [0, 360).
# Bearings wrap around (350 and 10 average to 0, not 180), so they are
# averaged as unit vectors instead of as plain numbers. When the directions
# cancel out exactly the mean direction is undefined and 0 is returned.
circ_mean_deg <- function(deg) {
  rad <- deg * pi / 180
  (atan2(mean(sin(rad)), mean(cos(rad))) * 180 / pi) %% 360
}

# Monthly averages of wind speed, air temperature and wind direction.
# `.groups = "drop"` returns an ungrouped tibble, so later row-wise operations
# are not silently performed per Year.
df.agg <- df %>%
  group_by(Year, Month) %>%
  summarize(
    m_ws = mean(WS),
    m_at = mean(AT),
    m_wd = circ_mean_deg(WD),
    .groups = "drop"
  )

# Rounding
mean_cols <- c("m_ws", "m_at", "m_wd")
df.agg[mean_cols] <- lapply(df.agg[mean_cols], round, digits = 5)

# Running month index (rows are ordered by Year, then Month); seq_len() is
# safe even if the table is empty.
df.agg$Order <- seq_len(nrow(df.agg))
df.agg
## # A tibble: 132 × 6
## # Groups:   Year [11]
##     Year Month  m_ws  m_at  m_wd Order
##    <dbl> <dbl> <dbl> <dbl> <dbl> <int>
##  1  2011     1  7.18 -5.03  239.     1
##  2  2011     2  8.34 -2.38  192.     2
##  3  2011     3  6.97  1.91  138      3
##  4  2011     4  7.97  7.60  177.     4
##  5  2011     5  7.75 12.2   189.     5
##  6  2011     6  6.24 18.7   201.     6
##  7  2011     7  4.81 24.2   169.     7
##  8  2011     8  5.68 22.9   163.     8
##  9  2011     9  7.37 16.4   180.     9
## 10  2011    10  7.33 13.3   202.    10
## # … with 122 more rows

Visualization of Aggregated Data

Wind Speed (m/s)

# Monthly mean wind speed as a time series over the running month index
ggplot(data = df.agg, mapping = aes(x = Order, y = m_ws, color = Month)) +
  geom_line()

# Distribution of the monthly means
hist(
  df.agg$m_ws,
  main = "Histogram of Wind Speed (m/s)",
  xlab = "Wind Speed (m/s)"
)

# Monthly means grouped by calendar month
boxplot(m_ws ~ Month, data = df.agg, ylab = "Wind Speed (m/s)")

### Wind Direction

# Wind rose of the monthly means, with speed binned in 2 m/s steps
windrose(
  speed = df.agg$m_ws,
  direction = df.agg$m_wd,
  speed_cuts = seq(from = 0, to = 10, by = 2),
  ggtheme = "minimal"
)

# Monthly mean wind direction as a time series over the running month index
ggplot(data = df.agg, mapping = aes(x = Order, y = m_wd, color = Month)) +
  geom_line()

# Distribution of the monthly means
hist(
  df.agg$m_wd,
  main = "Histogram of Wind Direction",
  xlab = "Wind Direction"
)

# Monthly means grouped by calendar month
boxplot(m_wd ~ Month, data = df.agg, ylab = "Wind Direction")

### Air Temperature

# Monthly mean air temperature as a time series over the running month index
ggplot(df.agg, aes(x = Order, y = m_at, color = Month)) +
  geom_line()

# General distribution.
# Labels corrected: the variable is air temperature, not "Wind Tempreture".
hist(
  df.agg$m_at,
  main = "Histogram of Air Temperature",
  xlab = "Air Temperature"
)

# Box plot by Month
boxplot(m_at ~ Month, data = df.agg, ylab = "Air Temperature")

We see an extreme outlier in our air temperature dataset.

# Flag outliers among the monthly mean air temperatures using the box-plot
# rule (no plot is drawn), then show the matching rows.
at.outliers <- boxplot(df.agg$m_at, plot = FALSE)$out
is_at_outlier <- df.agg$m_at %in% at.outliers
df.agg[is_at_outlier, ]
## # A tibble: 1 × 6
## # Groups:   Year [1]
##    Year Month  m_ws  m_at  m_wd Order
##   <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1  2019     5  5.93 -47.3  189.   101
#which(demo.data$score > quantile(demo.data$score)[4] + 1.5*IQR(demo.data$score)

The record for May 2019 has an unusual mean air temperature of -47.25484.

# Daily air temperatures recorded in May 2019
may_2019_at <- subset(df, Year == 2019 & Month == 5)$AT
hist(
  may_2019_at,
  xlab = "Air Temperature",
  main = "Daily Air Temperature from 2019 May"
)

We see an unusual distribution with many data points at an air temperature of -99. We consider these to be corrupted data, and hence replace the May 2019 monthly air temperature with the average May air temperature from the other years.

# Replacement value: mean of the May monthly temperatures from every year
# except 2019.
other_mays <- which(df.agg$Month == 5 & df.agg$Year != 2019)
new_at <- mean(df.agg$m_at[other_mays])

# Overwrite every monthly value that was flagged as an outlier
flagged <- which(df.agg$m_at %in% at.outliers)
df.agg$m_at[flagged] <- new_at
df.agg$m_at
##   [1] -5.02968 -2.38464  1.91097  7.60433 12.23323 18.66533 24.18484 22.92194
##   [9] 16.44300 13.26774  7.07967  2.05355 -0.05935  0.53517  9.82194  9.01167
##  [17] 16.02065 21.93900 25.79710 22.61097 17.94200 10.78194  5.75400  1.90032
##  [25] -2.52161 -2.22107  0.24000  7.40633 13.11452 17.66467 22.08000 22.19000
##  [33] 19.30033 11.80742  3.57000 -4.33516 -7.68677 -7.24500 -0.23419  7.55100
##  [41] 13.95194 17.87167 20.26839 21.77677 17.13667 11.47484  1.38967  0.70387
##  [49] -4.28419 -7.77500  2.73258  8.01800 13.82194 17.17900 21.65871 21.53226
##  [57] 19.63967 13.19484  7.97800  3.72935 -1.82000 -0.03172  6.43161  8.70633
##  [65] 16.16548 21.56700 24.40774 24.11935 20.64033 14.77323  7.00100 -3.45355
##  [73]  0.31968  4.75857  3.43581 10.74067 14.89000 21.90700 22.51903 21.24645
##  [81] 20.79067 12.09387  5.40300 -5.37161 -1.50968  1.46250  1.67903  7.22100
##  [89] 15.22806 20.38200 23.53161 23.98839 18.96967 10.40548  1.02833  1.67194
##  [97] -5.25323 -2.81286  5.48710  8.50733 15.11629 19.77367 23.96355 21.83742
## [105] 18.42033  6.54484  2.41467  2.23613 -0.44774  0.88724  4.98323  7.31333
## [113] 17.01935 22.20033 23.84097 23.19097 16.75133 10.69387  5.18767 -0.13419
## [121] -3.66452  0.11571  8.14968  9.08567 18.71774 21.70367 23.86710 23.79516
## [129] 20.32833 11.95613  3.69700  3.91421

Checking the distribution of air temperature after taking out the outlier.

# Monthly mean air temperature as a time series over the running month index
ggplot(df.agg, aes(x = Order, y = m_at, color = Month)) +
  geom_line()

# General distribution.
# Labels corrected: the variable is air temperature, not "Wind Tempreture".
hist(
  df.agg$m_at,
  main = "Histogram of Air Temperature",
  xlab = "Air Temperature"
)

# Box plot by Month
boxplot(m_at ~ Month, data = df.agg, ylab = "Air Temperature")

We see that the outlier has been removed while the original shape of the distribution is maintained.

# Save the monthly aggregates to disk (writing the daily table is disabled).
#write.csv(r, 'all_days.csv')
write.csv(x = df.agg, file = "agg.csv")